### Load standard packages
library(tidyverse) # Collection of all the good stuff like dplyr, ggplot2 etc.
library(magrittr) # For extra piping operators (e.g. %<>%)
library(tidytext) # Tidy text mining helpers (unnest_tokens, stop_words, ...)

Download the data

# Fetch the gzipped pol_tweets archive from the SDS-master repository into a
# temporary file; jsonlite supplies the streaming JSON reader used to parse it.
library(jsonlite)
tmp <- tempfile()
download.file(
  url = "https://github.com/SDS-AAU/SDS-master/raw/master/M2/data/pol_tweets.gz",
  destfile = tmp
)
trying URL 'https://github.com/SDS-AAU/SDS-master/raw/master/M2/data/pol_tweets.gz'
Content type 'application/octet-stream' length 7342085 bytes (7.0 MB)
==================================================
downloaded 7.0 MB
# Parse the downloaded archive as a JSON stream.
# gzfile()'s second argument is the connection open mode, not a label, so it is
# left unset here: stream_in() then receives a closed connection, opens it for
# reading itself and closes it again when it is done (no connection left open).
tweets_raw <- stream_in(gzfile(tmp))

 Found 1 records...
 Imported 1 records. Simplifying...
# Quick structural overview of the imported object
glimpse(tweets_raw)
Rows: 1
Columns: 2
$ text   <df[,50000]> <data.frame[1 x 50000]>
$ labels <df[,50000]> <data.frame[1 x 50000]>
# The import is a single record whose two fields are 1-row data frames with one
# column per tweet. Flatten them into one row per tweet: the column names become
# the tweet ID, the first field the text and the second field the logical label.
tweets <- tibble(
  ID = colnames(tweets_raw[[1]]),
  text = as.character(tweets_raw[[1]]),
  labels = as.logical(tweets_raw[[2]])
)
# rm(tweets_raw)
head(tweets)
# Drop retweets, i.e. every tweet whose text starts with "RT"
tweets <- tweets %>%
  filter(!str_detect(text, "^RT"))
head(tweets)

Tidying

# Tokenise the tweet text into one row per token (new column `word`), using
# tidytext's tweet-aware tokenizer.
# NOTE(review): token = "tweets" is reported as removed in tidytext >= 0.4.0 --
# confirm against the installed version before re-running.
tweets_tidy <- unnest_tokens(
  tweets,
  output = word,
  input = text,
  token = "tweets"
)
Using `to_lower = TRUE` with `token = 'tweets'` may not preserve URLs.
# First 50 tokens, then the raw token frequencies (most frequent first)
head(tweets_tidy, 50)
count(tweets_tidy, word, sort = TRUE)

Preprocessing

# Token clean-up: every step only drops rows, the surviving tokens are unchanged
tweets_tidy <- tweets_tidy %>%
  filter(
    !str_detect(word, "@"), # mentions
    !str_detect(word, "^amp|^http|^t\\.co"), # Twitter-specific artefacts
    str_length(word) > 2 # keep tokens of at least 3 characters
  ) %>%
  # Disabled step: strip all special characters
  # mutate(word = str_remove_all(word, "[^[:alnum:]]")) %>%
  group_by(word) %>%
  filter(n() > 100) %>% # keep words occurring more than 100 times
  ungroup() %>%
  anti_join(stop_words, by = "word") # remove stop words

TFIDF weighting

# 20 most frequent words after preprocessing
head(count(tweets_tidy, word, sort = TRUE), 20)
# TFIDF weights
# The rankings in this notebook sum a `tf_idf` column, so it has to be attached
# to the tokens first: count each word per tweet, keep a single row per
# (tweet, word) pair, then let bind_tf_idf() add tf, idf and tf_idf with every
# tweet ID treated as one document.
tweets_tidy %<>%
  add_count(ID, word) %>%
  distinct(ID, word, .keep_all = TRUE) %>%
  bind_tf_idf(term = word,
              document = ID,
              n = n)
# TFIDF topwords: 20 words with the highest summed tf-idf weight
tweets_tidy %>%
  count(word, wt = tf_idf, sort = TRUE) %>%
  head(20)

Inspecting

Words by party affiliation

# Sum the tf-idf weight of every word within each label value, sort descending
# and keep the first 100 words per label.
labels_words <- tweets_tidy %>%
  count(labels, word, wt = tf_idf, sort = TRUE, name = "tf_idf") %>%
  group_by(labels) %>%
  slice(seq_len(100)) %>%
  ungroup()
# Horizontal bar chart of the summed tf-idf per word, one facet per label value.
# reorder_within() / scale_x_reordered() order the bars separately in each facet.
ggplot(
  data = mutate(
    labels_words,
    word = reorder_within(word, by = tf_idf, within = labels)
  ),
  mapping = aes(x = word, y = tf_idf, fill = labels)
) +
  geom_col(show.legend = FALSE) +
  scale_x_reordered() +
  coord_flip() +
  facet_wrap(~labels, ncol = 2, scales = "free") +
  labs(x = NULL, y = "tf-idf")

Sentiments?

# Attach the Bing sentiment label to every token found in the lexicon; tokens
# without a lexicon entry are dropped by the inner join. The join key is named
# explicitly (as in the stop-word anti_join) so the join cannot silently pick up
# additional shared columns and no "Joining, by" message is emitted.
sentiment_tweet <- tweets_tidy %>%
  inner_join(get_sentiments("bing"), by = "word")

… To be continued by you

Towards prediction?

# Build a wide document-term table: one row per tweet (ID, labels) and one
# column per word holding its tf-idf weight. Expects `tweets_tidy` to carry one
# row per (ID, word) pair with a `tf_idf` column; the other weighting columns
# are dropped first so they do not act as extra row identifiers.
tweets_dtm <- tweets_tidy %>%
  select(ID, labels, word, tf_idf) %>%
  pivot_wider(names_from = word, values_from = tf_idf)
# Words absent from a tweet come out as NA; set them to 0. Only numeric columns
# are touched: a numeric replacement cannot be applied to the character ID
# column (recent tidyr versions reject the type change).
tweets_dtm %<>% mutate(across(where(is.numeric), .fns = ~replace_na(., 0)))
rm(tweets_dtm)
LS0tCnRpdGxlOiAnTkxQIHdvcmtzaG9wIC0gRXhwbG9yaW5nIFByZXNpZGVudGlhbCBEZWJhdGUgb24gdHdpdHRlcicKYXV0aG9yOiAiRGFuaWVsIFMuIEhhaW4gKGRzaEBidXNpbmVzcy5hYXUuZGspIgpkYXRlOiAiVXBkYXRlZCBgciBmb3JtYXQoU3lzLnRpbWUoKSwgJyVCICVkLCAlWScpYCIKb3V0cHV0OgogIGh0bWxfbm90ZWJvb2s6CiAgICBjb2RlX2ZvbGRpbmc6IHNob3cKICAgIGRmX3ByaW50OiBwYWdlZAogICAgdG9jOiB0cnVlCiAgICB0b2NfZGVwdGg6IDIKICAgIHRvY19mbG9hdDoKICAgICAgY29sbGFwc2VkOiBmYWxzZQogICAgdGhlbWU6IGZsYXRseQotLS0KCmBgYHtyIHNldHVwLCBpbmNsdWRlPUZBTFNFfQojIyMgR2VuZXJpYyBwcmVhbWJsZQpybShsaXN0PWxzKCkpClN5cy5zZXRlbnYoTEFORyA9ICJlbiIpICMgRm9yIGVuZ2xpc2ggbGFuZ3VhZ2UKb3B0aW9ucyhzY2lwZW4gPSA1KSAjIFRvIGRlYWN0aXZhdGUgYW5ub3lpbmcgc2NpZW50aWZpYyBudW1iZXIgbm90YXRpb24KCiMjIyBLbml0ciBvcHRpb25zCmxpYnJhcnkoa25pdHIpICMgRm9yIGRpc3BsYXkgb2YgdGhlIG1hcmtkb3duCmtuaXRyOjpvcHRzX2NodW5rJHNldCh3YXJuaW5nPUZBTFNFLAogICAgICAgICAgICAgICAgICAgICBtZXNzYWdlPUZBTFNFLAogICAgICAgICAgICAgICAgICAgICBjb21tZW50PUZBTFNFLCAKICAgICAgICAgICAgICAgICAgICAgZmlnLmFsaWduPSJjZW50ZXIiCiAgICAgICAgICAgICAgICAgICAgICkKYGBgCgpgYGB7cn0KIyMjIExvYWQgc3RhbmRhcmRwYWNrYWdlcwpsaWJyYXJ5KHRpZHl2ZXJzZSkgIyBDb2xsZWN0aW9uIG9mIGFsbCB0aGUgZ29vZCBzdHVmZiBsaWtlIGRwbHlyLCBnZ3Bsb3QyIGVjdC4KbGlicmFyeShtYWdyaXR0cikgIyBGb3IgZXh0cmEtcGlwaW5nIG9wZXJhdG9ycyAoZWcuICU8PiUpCmBgYAoKYGBge3J9CmxpYnJhcnkodGlkeXRleHQpCmBgYAoKCiMgRG93bmxvYWQgdGhlIGRhdGEKCmBgYHtyfQojIGRvd25sb2FkIGFuZCBvcGVuIHNvbWUgVHJ1bXAgdHdlZXRzIGZyb20gdHJ1bXBfdHdlZXRfZGF0YV9hcmNoaXZlCmxpYnJhcnkoanNvbmxpdGUpCnRtcCA8LSB0ZW1wZmlsZSgpCmRvd25sb2FkLmZpbGUoImh0dHBzOi8vZ2l0aHViLmNvbS9TRFMtQUFVL1NEUy1tYXN0ZXIvcmF3L21hc3Rlci9NMi9kYXRhL3BvbF90d2VldHMuZ3oiLCB0bXApCgp0d2VldHNfcmF3IDwtIHN0cmVhbV9pbihnemZpbGUodG1wLCAicG9sX3R3ZWV0cyIpKQpgYGAKCmBgYHtyfQp0d2VldHNfcmF3ICU+JSBnbGltcHNlKCkKYGBgCgpgYGB7cn0KdHdlZXRzIDwtIHRpYmJsZShJRCA9IGNvbG5hbWVzKHR3ZWV0c19yYXdbWzFdXSksIAogICAgICAgICAgICAgICAgIHRleHQgPSB0d2VldHNfcmF3W1sxXV0gJT4lIGFzLmNoYXJhY3RlcigpLCAKICAgICAgICAgICAgICAgICBsYWJlbHMgPSB0d2VldHNfcmF3W1syXV0gJT4lIGFzLmxvZ2ljYWwoKSkKI3JtKHR3ZWV0c19yYXcpCmBgYAoKYGBge3J9CnR3ZWV0cyAlPiUgaGVhZCgp
CmBgYAoKCmBgYHtyfQp0d2VldHMgJTw+JQogIGZpbHRlcighKHRleHQgJT4lIHN0cl9kZXRlY3QoJ15SVCcpKSkgIyBGaWx0ZXIgcmV0d2VldHMKYGBgCgpgYGB7cn0KdHdlZXRzICU+JSBoZWFkKCkKYGBgCgojIFRpZHlpbmcKCmBgYHtyfQp0d2VldHNfdGlkeSA8LSB0d2VldHMgJT4lCiAgdW5uZXN0X3Rva2Vucyh3b3JkLCB0ZXh0LCB0b2tlbiA9ICJ0d2VldHMiKSAKYGBgCgpgYGB7cn0KdHdlZXRzX3RpZHkgJT4lIGhlYWQoNTApCmBgYAoKCmBgYHtyfQp0d2VldHNfdGlkeSAlPiUgY291bnQod29yZCwgc29ydCA9IFRSVUUpCmBgYAoKCiMgUHJlcHJvY2Vzc2luZwoKYGBge3J9CiMgcHJlcHJvY2Vzc2luZwp0d2VldHNfdGlkeSAlPD4lCiAgZmlsdGVyKCEod29yZCAlPiUgc3RyX2RldGVjdCgnQCcpKSkgJT4lICMgcmVtb3ZlIG1lbnRpb25zCiAgZmlsdGVyKCEod29yZCAlPiUgc3RyX2RldGVjdCgnXmFtcHxeaHR0cHxedFxcLmNvJykpKSAlPiUgIyBUd2l0dGVyIHNwZWNpZmljIHN0dWZmCiMgIG11dGF0ZSh3b3JkID0gd29yZCAlPiUgc3RyX3JlbW92ZV9hbGwoJ1teWzphbG51bTpdXScpKSAlPiUgIyMgcmVtb3ZlIGFsbCBzcGVjaWFsIGNoYXJhY3RlcnMKICBmaWx0ZXIoc3RyX2xlbmd0aCh3b3JkKSA+IDIgKSAlPiUgIyBSZW1vdmUgd29yZHMgd2l0aCBsZXNzIHRoYW4gIDMgY2hhcmFjdGVycwogIGdyb3VwX2J5KHdvcmQpICU+JQogIGZpbHRlcihuKCkgPiAxMDApICU+JSAjIHJlbW92ZSB3b3JkcyBvY2N1cmluZyBsZXNzIHRoYW4gMTAwIHRpbWVzCiAgdW5ncm91cCgpICU+JQogIGFudGlfam9pbihzdG9wX3dvcmRzLCBieSA9ICd3b3JkJykgIyByZW1vdmUgc3RvcHdvcmRzCmBgYAoKIyBURklERiB3ZWlnaHRpbmcKCmBgYHtyfQojIHRvcCB3b3Jkcwp0d2VldHNfdGlkeSAlPiUKICBjb3VudCh3b3JkLCBzb3J0ID0gVFJVRSkgJT4lCiAgaGVhZCgyMCkKYGBgCgpgYGB7cn0KIyBURklERiB3ZWlnaHRzCnR3ZWV0c190aWR5ICU8PiUKICBhZGRfY291bnQoSUQsIHdvcmQpICU+JQogIGRpc3RpbmN0KElELCB3b3JkLCAua2VlcF9hbGwgPSBUUlVFKSAlPiUKICBiaW5kX3RmX2lkZih0ZXJtID0gd29yZCwKICAgICAgICAgICAgICBkb2N1bWVudCA9IElELAogICAgICAgICAgICAgIG4gPSBuKQpgYGAKCgpgYGB7cn0KIyBURklERiB0b3B3b3Jkcwp0d2VldHNfdGlkeSAlPiUKICBjb3VudCh3b3JkLCB3dCA9IHRmX2lkZiwgc29ydCA9IFRSVUUpICU+JQogIGhlYWQoMjApCmBgYAoKIyBJbnNwZWN0aW5nCgojIyBXb3JkcyBieSBwYXJ0eSBhZmZpbGlhdGlvbgoKYGBge3J9CmxhYmVsc193b3JkcyA8LSB0d2VldHNfdGlkeSAlPiUKICBncm91cF9ieShsYWJlbHMpICU+JQogIGNvdW50KHdvcmQsIHd0ID0gdGZfaWRmLCBzb3J0ID0gVFJVRSwgbmFtZSA9ICJ0Zl9pZGYiKSAlPiUKICBzbGljZSgxOjEwMCkgJT4lCiAgdW5ncm91cCgpIApgYGAKCmBgYHtyLCBmaWcud2lkdGg9MTB9CmxhYmVsc193b3JkcyAlPiUKICBtdXRhdGUod29yZCA9
IHJlb3JkZXJfd2l0aGluKHdvcmQsIGJ5ID0gdGZfaWRmLCB3aXRoaW4gPSBsYWJlbHMpKSAlPiUKICBnZ3Bsb3QoYWVzKHggPSB3b3JkLCB5ID0gdGZfaWRmLCBmaWxsID0gbGFiZWxzKSkgKwogIGdlb21fY29sKHNob3cubGVnZW5kID0gRkFMU0UpICsKICBsYWJzKHggPSBOVUxMLCB5ID0gInRmLWlkZiIpICsKICBmYWNldF93cmFwKH5sYWJlbHMsIG5jb2wgPSAyLCBzY2FsZXMgPSAiZnJlZSIpICsKICBjb29yZF9mbGlwKCkgKwogIHNjYWxlX3hfcmVvcmRlcmVkKCkKYGBgCgojIyBTZW50aW1lbnRzPwoKYGBge3J9CnNlbnRpbWVudF90d2VldCA8LSB0d2VldHNfdGlkeSAlPiUKICBpbm5lcl9qb2luKGdldF9zZW50aW1lbnRzKCJiaW5nIikpCmBgYAoKLi4uIFRvIGJlIGNvbnRpbnVlZCBieSB5b3UKCiMjIFRvd2FyZHMgcHJlZGljdGlvbj8KCmBgYHtyfQp0d2VldHNfZHRtIDwtIHR3ZWV0c190aWR5ICU+JQogIHBpdm90X3dpZGVyKG5hbWVzX2Zyb20gPSB3b3JkLCB2YWx1ZXNfZnJvbSA9IHRmX2lkZikKYGBgCgpgYGB7cn0KdHdlZXRzX2R0bSAlPD4lIG11dGF0ZShhY3Jvc3MoZXZlcnl0aGluZygpLCAuZm5zID0gfnJlcGxhY2VfbmEoLiwwKSkpIApgYGAKCgpgYGB7cn0Kcm0odHdlZXRzX2R0bSkKYGBgCgoKCg==